Load in the dataset “zombies.csv” from my GitHub repo at https://github.com/fuzzyatelin/fuzzyatelin.github.io/tree/master/AN597_Fall19/. This data includes the first and last name and gender of the entire population of 1000 people who have survived the zombie apocalypse and are now eking out an existence somewhere on the East Coast, along with several other variables (height, weight, age, number of years of education, number of zombies they have killed, and college major; see here for info on important post-zombie apocalypse majors). ***
# Load the "zombies.csv" survivor dataset from the course GitHub repository.
library(curl)
# Open a connection to the raw CSV file; one assignment is sufficient.
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN597_Fall19/zombies.csv")
# Read the file, keeping text columns as character vectors instead of factors.
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)
# Show the first six rows as a quick check that the import worked.
head(d)
## id first_name last_name gender height weight zombies_killed
## 1 1 Sarah Little Female 62.88951 132.0872 2
## 2 2 Mark Duncan Male 67.80277 146.3753 5
## 3 3 Brandon Perez Male 72.12908 152.9370 1
## 4 4 Roger Coleman Male 66.78484 129.7418 5
## 5 5 Tammy Powell Female 64.71832 132.4265 4
## 6 6 Anthony Green Male 71.24326 152.5246 1
## years_of_education major age
## 1 1 medicine/nursing 17.64275
## 2 3 criminal justice administration 22.58951
## 3 1 education 21.91276
## 4 6 energy studies 18.19058
## 5 3 logistics 21.10399
## 6 4 energy studies 21.48355
Calculate the population mean and standard deviation for each quantitative random variable (height, weight, age, number of zombies killed, and years of education). NOTE: You will not want to use the built-in var() and sd() commands, as these are for samples. ***
# Per-column summary of the whole data frame.
summary(d)
## id first_name last_name gender
## Min. : 1.0 Length:1000 Length:1000 Length:1000
## 1st Qu.: 250.8 Class :character Class :character Class :character
## Median : 500.5 Mode :character Mode :character Mode :character
## Mean : 500.5
## 3rd Qu.: 750.2
## Max. :1000.0
## height weight zombies_killed years_of_education
## Min. :54.15 Min. : 90.29 Min. : 0.000 Min. :0.000
## 1st Qu.:64.68 1st Qu.:131.81 1st Qu.: 2.000 1st Qu.:2.000
## Median :67.50 Median :142.89 Median : 3.000 Median :3.000
## Mean :67.63 Mean :143.91 Mean : 2.992 Mean :2.996
## 3rd Qu.:70.38 3rd Qu.:156.28 3rd Qu.: 4.000 3rd Qu.:4.000
## Max. :80.53 Max. :210.79 Max. :11.000 Max. :8.000
## major age
## Length:1000 Min. :10.66
## Class :character 1st Qu.:18.07
## Mode :character Median :19.90
## Mean :20.05
## 3rd Qu.:21.94
## Max. :29.59

# attach() is retained so that any later code referring to the columns by bare
# name keeps working; the calculations below use explicit d$ references.
attach(d)

# Population standard deviation of a numeric vector `x`.
# Divides the summed squared deviations by N (the number of values) rather
# than N - 1, which is what the built-in sd() uses for samples.
# Returns a single numeric value.
pop_sd <- function(x) {
  sqrt(sum((x - mean(x))^2) / length(x))
}

# Height
mean(d$height)
## [1] 67.6301
Height_sigma <- pop_sd(d$height)
Height_sigma
## [1] 4.30797

# Weight
mean(d$weight)
## [1] 143.9075
weight_sigma <- pop_sd(d$weight)
weight_sigma
## [1] 18.39186

# Age
mean(d$age)
## [1] 20.04696
age_sigma <- pop_sd(d$age)
age_sigma
## [1] 2.963583

# Number of zombies killed
mean(d$zombies_killed)
## [1] 2.992
zombiesKilled_sigma <- pop_sd(d$zombies_killed)
zombiesKilled_sigma
## [1] 1.747551

# Years of education
mean(d$years_of_education)
## [1] 2.996
yearsofedu_sigma <- pop_sd(d$years_of_education)
yearsofedu_sigma
## [1] 1.675704
Use {ggplot} to make boxplots of each of these variables by gender. ***
library(ggplot2)

# Build a ggplot boxplot of one numeric column of `d`, split by gender.
#   column  - name of the column in `d` to plot (character string)
#   title   - plot title
#   y_label - y-axis label
# Returns the ggplot object; it is drawn when printed.
gender_boxplot <- function(column, title, y_label) {
  ggplot(d, aes(x = gender, y = .data[[column]])) +
    geom_boxplot() +
    # Labels are matched by level name, so they cannot end up on the wrong
    # group the way positional names = c(...) can.
    scale_x_discrete(labels = c(Female = "Females", Male = "Males")) +
    labs(title = title, x = "Gender", y = y_label)
}

# Height by Gender
print(gender_boxplot("height", "Boxplot of Height by gender", "Height (inches)"))
# Weight by Gender
print(gender_boxplot("weight", "Boxplot of Weight by Gender", "Weight (lbs)"))
# Age by Gender
print(gender_boxplot("age", "Boxplot of Age by Gender", "Age"))
# Number of Zombies Killed by Gender
print(gender_boxplot("zombies_killed", "Boxplot of Zombies Killed by Gender", "Zombies Killed"))
# Number of Years of Education by Gender
print(gender_boxplot("years_of_education", "Boxplot of Years of Education by Gender", "Years of Education"))
Use {ggplot} to make scatterplots of height and weight in relation to age. Do these variables seem to be related? In what way? ***
library(ggplot2)
# `d` is a data frame already in the workspace, not a packaged dataset, so it
# needs neither data("d") nor a second attach(); both only produced
# warnings/masking messages.

# Confirm the column types before plotting.
str(d)
## 'data.frame': 1000 obs. of 10 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ first_name : chr "Sarah" "Mark" "Brandon" "Roger" ...
## $ last_name : chr "Little" "Duncan" "Perez" "Coleman" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ height : num 62.9 67.8 72.1 66.8 64.7 ...
## $ weight : num 132 146 153 130 132 ...
## $ zombies_killed : int 2 5 1 5 4 1 0 4 9 2 ...
## $ years_of_education: int 1 3 1 6 3 4 4 0 3 3 ...
## $ major : chr "medicine/nursing" "criminal justice administration" "education" "energy studies" ...
## $ age : num 17.6 22.6 21.9 18.2 21.1 ...

# Height in relation to age: ggplot scatterplot coloured by gender, followed by
# base-graphics versions on the raw and the log-log scale.
ggplot() + geom_point(data = d, aes(x = age, y = height, color = gender))
# with() looks the columns up inside `d`, so no attached copy is required and
# the default axis labels of the log-log plot stay "log(age)" / "log(height)".
with(d, plot(age, height, main = "Scatterplot Age and Height", xlab = "Age", ylab = "height"))
with(d, plot(x = log(age), y = log(height)))

# Weight in relation to age: same three views.
ggplot() + geom_point(data = d, aes(x = age, y = weight, color = gender))
with(d, plot(age, weight, main = "Scatterplot Age and Weight", xlab = "Age", ylab = "weight"))
with(d, plot(x = log(age), y = log(weight)))
Using histograms and Q-Q plots, check whether the quantitative variables seem to be drawn from a normal distribution. Which seem to be and which do not (hint: not all are drawn from the normal distribution)? For those that are not normal, can you determine from which common distribution they are drawn? ***
# Normal Q-Q plot (with reference line) and histogram for every quantitative
# variable. Points that follow the reference line closely suggest the variable
# is approximately normal.

# Column name in `d` -> wording used in that variable's Q-Q plot title.
qq_titles <- c(
  height = "height",
  weight = "weight",
  age = "age",
  zombies_killed = "zombies killed",
  years_of_education = "years of education"
)

for (column in names(qq_titles)) {
  values <- d[[column]]
  qqnorm(values, pch = 1, frame = FALSE,
         main = paste("Normal QQ plot", qq_titles[[column]]))
  # The reference line is always drawn from the same values as the Q-Q plot;
  # previously the age plot received the line computed from weight.
  qqline(values, col = "steelblue", lwd = 2)
  # Title and axis label are set explicitly to what hist() would derive from a
  # bare column name.
  hist(values, main = paste("Histogram of", column), xlab = column)
}
Now use the sample() function to sample ONE subset of 30 zombie survivors (without replacement) from this population and calculate the mean and sample standard deviation for each variable. Also estimate the standard error for each variable, and construct the 95% confidence interval for each mean. Note that for the variables that are not drawn from the normal distribution, you may need to base your estimate of the CIs on slightly different code than for the normal… ***
# Draw ONE subset of 30 survivors (without replacement) by sampling row
# indices once, so every variable is measured on the same 30 people.
# The sample is random, so the printed values differ from run to run.
sample_size <- 30
sampled_rows <- sample(nrow(d), sample_size, replace = FALSE)
survivors <- d[sampled_rows, ]

he <- survivors$height
wei <- survivors$weight
ag <- survivors$age
zk <- survivors$zombies_killed
edu <- survivors$years_of_education

# Summarise one sample: mean, sample standard deviation, estimated standard
# error of the mean, and a confidence interval for the mean.
#   x          - numeric vector holding the sample
#   normal     - TRUE: CI from the t distribution with n - 1 degrees of
#                freedom (the interval t.test() reports);
#                FALSE: percentile bootstrap CI, which makes no normality
#                assumption
#   conf_level - confidence level of the interval
#   n_boot     - number of bootstrap resamples (used only when normal = FALSE)
# Returns a named numeric vector: mean, sd, se, ci_lower, ci_upper.
describe_sample <- function(x, normal = TRUE, conf_level = 0.95, n_boot = 1000) {
  n <- length(x)
  std_error <- sd(x) / sqrt(n)
  tail_prob <- (1 - conf_level) / 2
  if (normal) {
    ci <- mean(x) + qt(c(tail_prob, 1 - tail_prob), df = n - 1) * std_error
  } else {
    boot_means <- replicate(n_boot, mean(x[sample.int(n, n, replace = TRUE)]))
    ci <- unname(quantile(boot_means, c(tail_prob, 1 - tail_prob)))
  }
  c(mean = mean(x), sd = sd(x), se = std_error,
    ci_lower = ci[1], ci_upper = ci[2])
}

# Look at the sampled values themselves rather than the whole population.
hist(he, main = "Histogram of sampled height", xlab = "height")
boxplot(he, main = "Sampled height")
hist(wei, main = "Histogram of sampled weight", xlab = "weight")
boxplot(wei, main = "Sampled weight")
hist(ag, main = "Histogram of sampled age", xlab = "age")
boxplot(ag, main = "Sampled age")

# zombies_killed and years_of_education are integer counts, so their
# intervals use the bootstrap instead of assuming normality.
sample_summary <- rbind(
  height = describe_sample(he),
  weight = describe_sample(wei),
  age = describe_sample(ag),
  zombies_killed = describe_sample(zk, normal = FALSE),
  years_of_education = describe_sample(edu, normal = FALSE)
)
sample_summary
## [1] 3.61837
Now draw 99 more random samples of 30 zombie apocalypse survivors, and calculate the mean for each variable for each of these samples. Together with the first sample you drew, you now have a set of 100 means for each variable (each based on 30 observations), which constitutes a sampling distribution for each variable. What are the means and standard deviations of this distribution of means for each variable? How do the standard deviations of means compare to the standard errors estimated in [5]? What do these sampling distributions look like (a graph might help here)? Are they normally distributed? What about for those variables that you concluded were not originally drawn from a normal distribution? ***
Challenges: I could not figure out how to solve question 6, and I also failed to share this for peer commentary because I did not finish the homework on time; I have been working on it over the weekend. I will continue working on question 6 and upload a revised version later tonight.